import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
# Raise the pandas display limits so wide and long frames are printed in full.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 5000
def dame_variables_categoricas(dataset=None):
    """
    Split the non-numeric columns of a dataframe by cardinality.

    A column is a candidate when its dtype is neither an integer nor a
    floating-point dtype (of any width). Candidates are then classified by
    their number of distinct non-null values.

    Parameters
    ----------
    dataset : pandas.DataFrame, optional
        Dataframe whose columns are inspected.

    Returns
    -------
    tuple of (list, list)
        ``(lista_variables_categoricas, other)``: names of the candidate
        columns with fewer than 100 distinct values, and names of the
        candidate columns with 100 or more.
    int
        ``1`` when ``dataset`` is None (legacy error code kept for
        backward compatibility).
    """
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1

    lista_variables_categoricas = []
    other = []
    for nombre in dataset.columns:
        columna = dataset[nombre]
        # Use the pandas dtype helpers rather than `dtype != int/float`:
        # the latter only matches the platform-default 64-bit dtypes.
        es_numerica = (
            pd.api.types.is_integer_dtype(columna.dtype)
            or pd.api.types.is_float_dtype(columna.dtype)
        )
        if es_numerica:
            continue
        # nunique() ignores NaN and is hash-based, so it also works on
        # object columns mixing types (np.unique would try to sort them).
        if columna.nunique() < 100:
            lista_variables_categoricas.append(nombre)
        else:
            other.append(nombre)

    return lista_variables_categoricas, other
# Load the data from the CSV file and display the resulting dataframe.
Base_df = pd.read_csv('../data/Base.csv')
Base_df
| fraud_bool | income | name_email_similarity | prev_address_months_count | current_address_months_count | customer_age | days_since_request | intended_balcon_amount | payment_type | zip_count_4w | velocity_6h | velocity_24h | velocity_4w | bank_branch_count_8w | date_of_birth_distinct_emails_4w | employment_status | credit_risk_score | email_is_free | housing_status | phone_home_valid | phone_mobile_valid | bank_months_count | has_other_cards | proposed_credit_limit | foreign_request | source | session_length_in_minutes | device_os | keep_alive_session | device_distinct_emails_8w | device_fraud_count | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.9 | 0.166828 | -1 | 88 | 50 | 0.020925 | -1.331345 | AA | 769 | 10650.765523 | 3134.319630 | 3863.647740 | 1 | 6 | CA | 185 | 0 | BA | 1 | 0 | 24 | 0 | 500.0 | 0 | INTERNET | 3.888115 | windows | 0 | 1 | 0 | 7 |
| 1 | 1 | 0.9 | 0.296286 | -1 | 144 | 50 | 0.005418 | -0.816224 | AB | 366 | 534.047319 | 2670.918292 | 3124.298166 | 718 | 3 | CA | 259 | 1 | BA | 0 | 0 | 15 | 0 | 1500.0 | 0 | INTERNET | 31.798819 | windows | 0 | 1 | 0 | 7 |
| 2 | 1 | 0.9 | 0.044985 | -1 | 132 | 40 | 3.108549 | -0.755728 | AC | 870 | 4048.534263 | 2893.621498 | 3159.590679 | 1 | 14 | CB | 177 | 1 | BA | 0 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 4.728705 | other | 0 | 1 | 0 | 7 |
| 3 | 1 | 0.9 | 0.159511 | -1 | 22 | 50 | 0.019079 | -1.205124 | AB | 810 | 3457.064063 | 4054.908412 | 3022.261812 | 1921 | 6 | CA | 110 | 1 | BA | 0 | 1 | 31 | 1 | 200.0 | 0 | INTERNET | 2.047904 | linux | 0 | 1 | 0 | 7 |
| 4 | 1 | 0.9 | 0.596414 | -1 | 218 | 50 | 0.004441 | -0.773276 | AB | 890 | 5020.341679 | 2728.237159 | 3087.670952 | 1990 | 2 | CA | 295 | 1 | BA | 1 | 0 | 31 | 0 | 1500.0 | 0 | INTERNET | 3.775225 | macintosh | 1 | 1 | 0 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 999995 | 0 | 0.6 | 0.192631 | -1 | 104 | 40 | 0.030592 | -1.044454 | AB | 804 | 7905.711839 | 8341.468557 | 4972.635997 | 1 | 8 | CA | 75 | 1 | BC | 1 | 1 | 25 | 0 | 200.0 | 0 | INTERNET | 8.511502 | linux | 1 | 1 | 0 | 4 |
| 999996 | 0 | 0.8 | 0.322989 | 148 | 9 | 50 | 1.628119 | -1.409803 | AC | 3306 | 5391.470463 | 4955.170808 | 5022.728108 | 0 | 2 | CC | 154 | 1 | BC | 1 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 8.967865 | windows | 0 | 1 | 0 | 4 |
| 999997 | 0 | 0.8 | 0.879403 | -1 | 30 | 20 | 0.018563 | 34.692760 | AA | 1522 | 8063.102636 | 5670.654316 | 4377.196321 | 2023 | 6 | CF | 64 | 0 | BC | 0 | 1 | 11 | 0 | 200.0 | 0 | INTERNET | 8.195531 | other | 0 | 1 | 0 | 4 |
| 999998 | 0 | 0.9 | 0.762112 | -1 | 189 | 20 | 0.015352 | 94.661055 | AA | 1418 | 8092.641762 | 3982.582204 | 4394.803296 | 1678 | 6 | CA | 163 | 0 | BA | 1 | 0 | 28 | 0 | 500.0 | 0 | INTERNET | 4.336064 | windows | 1 | 1 | 0 | 4 |
| 999999 | 0 | 0.2 | 0.697452 | -1 | 321 | 20 | 2.655916 | 9.908499 | AA | 951 | 6169.630036 | 3695.308261 | 4352.334543 | 2 | 12 | CA | 36 | 1 | BE | 0 | 1 | 15 | 0 | 200.0 | 0 | INTERNET | 6.717022 | linux | 0 | 1 | 0 | 4 |
1000000 rows × 32 columns
# Print the shape before and after dropping duplicate rows; equal shapes mean no duplicates.
print(Base_df.shape, Base_df.drop_duplicates().shape)
(1000000, 32) (1000000, 32)
En el dataframe hay 1000000 filas y 32 columnas (variables). También vemos que no hay filas duplicadas, ya que la forma del dataframe no cambia al eliminar duplicados.
# Show the dtype of every column as a {column name: dtype} dictionary.
Base_df.dtypes.to_dict()
{'fraud_bool': dtype('int64'),
'income': dtype('float64'),
'name_email_similarity': dtype('float64'),
'prev_address_months_count': dtype('int64'),
'current_address_months_count': dtype('int64'),
'customer_age': dtype('int64'),
'days_since_request': dtype('float64'),
'intended_balcon_amount': dtype('float64'),
'payment_type': dtype('O'),
'zip_count_4w': dtype('int64'),
'velocity_6h': dtype('float64'),
'velocity_24h': dtype('float64'),
'velocity_4w': dtype('float64'),
'bank_branch_count_8w': dtype('int64'),
'date_of_birth_distinct_emails_4w': dtype('int64'),
'employment_status': dtype('O'),
'credit_risk_score': dtype('int64'),
'email_is_free': dtype('int64'),
'housing_status': dtype('O'),
'phone_home_valid': dtype('int64'),
'phone_mobile_valid': dtype('int64'),
'bank_months_count': dtype('int64'),
'has_other_cards': dtype('int64'),
'proposed_credit_limit': dtype('float64'),
'foreign_request': dtype('int64'),
'source': dtype('O'),
'session_length_in_minutes': dtype('float64'),
'device_os': dtype('O'),
'keep_alive_session': dtype('int64'),
'device_distinct_emails_8w': dtype('int64'),
'device_fraud_count': dtype('int64'),
'month': dtype('int64')}
A continuación, mostramos el recuento de valores únicos que existen dentro de "fraud_bool", también en porcentaje
# Share (in %) of each fraud_bool class.
Base_df_plot_fraud_bool_status = (
    Base_df['fraud_bool']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)
# Absolute number of rows in each class.
Base_df_plot_fraud_bool_status_conteo = Base_df['fraud_bool'].value_counts().reset_index()
# Percentages and counts side by side, joined on the class label.
Base_df_plot_fraud_bool_status_pc = Base_df_plot_fraud_bool_status.merge(
    Base_df_plot_fraud_bool_status_conteo,
    on=['fraud_bool'],
    how='inner',
)
Base_df_plot_fraud_bool_status_pc
| fraud_bool | percent | count | |
|---|---|---|---|
| 0 | 0 | 98.8971 | 988971 |
| 1 | 1 | 1.1029 | 11029 |
Graficamos el recuento de valores. Vemos que los casos de fraude son muy minoritarios: apenas un 1,1 % de los registros.
# Plot the 'percent' column against fraud_bool with a Plotly histogram and render it.
fig = px.histogram(Base_df_plot_fraud_bool_status_pc, x="fraud_bool", y=['percent'])
fig.show()
# Number of NaN values per column and per row, largest first.
base_series_null_columns = Base_df.isnull().sum().sort_values(ascending=False)
base_series_null_rows = Base_df.isnull().sum(axis=1).sort_values(ascending=False)
print(base_series_null_columns.shape, base_series_null_rows.shape)

# Per-column summary: NaN count and its share over the number of rows.
base_null_columnas = base_series_null_columns.to_frame('nulos_columnas')
base_null_columnas['porcentaje_columnas'] = (
    base_null_columnas['nulos_columnas'] / Base_df.shape[0]
)

# Per-row summary: NaN count, the row's target value and the share over
# the number of columns.
base_null_filas = base_series_null_rows.to_frame('nulos_filas')
base_null_filas['target'] = Base_df['fraud_bool'].copy()
base_null_filas['porcentaje_filas'] = (
    base_null_filas['nulos_filas'] / Base_df.shape[1]
)
(32,) (1000000,)
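Este código nos ayuda a detectar valores nulos (NAs). Sin embargo, en nuestra base de datos los valores ausentes no están almacenados como NaN, sino codificados con valores negativos (-1), por lo que `isnull()` no los detecta y el recuento anterior resulta inútil. Para detectarlos, haremos lo siguiente: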
# Work on an explicit copy so that replacing the sentinels can never
# modify Base_df itself.
filtered_data = Base_df.copy()
# Columns in which a negative value (-1 according to the original note)
# encodes a missing value.
columns_to_replace = ['prev_address_months_count', 'current_address_months_count', 'bank_months_count', 'session_length_in_minutes', 'intended_balcon_amount']
# Vectorised replacement: mask() writes NaN wherever the condition holds.
# This avoids the deprecated, element-by-element DataFrame.applymap.
filtered_data[columns_to_replace] = filtered_data[columns_to_replace].mask(
    filtered_data[columns_to_replace] < 0
)
# NaN count per column and its percentage (0-100 scale) over all rows.
null_values = filtered_data.isnull().sum()
null_percentage = (null_values / len(Base_df)) * 100
base_null_columnas = pd.DataFrame({'nulos_columnas': null_values, 'porcentaje_columnas': null_percentage})
print(base_null_columnas)
/var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3456504727.py:4: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
nulos_columnas porcentaje_columnas fraud_bool 0 0.0000 income 0 0.0000 name_email_similarity 0 0.0000 prev_address_months_count 712920 71.2920 current_address_months_count 4254 0.4254 customer_age 0 0.0000 days_since_request 0 0.0000 intended_balcon_amount 742523 74.2523 payment_type 0 0.0000 zip_count_4w 0 0.0000 velocity_6h 0 0.0000 velocity_24h 0 0.0000 velocity_4w 0 0.0000 bank_branch_count_8w 0 0.0000 date_of_birth_distinct_emails_4w 0 0.0000 employment_status 0 0.0000 credit_risk_score 0 0.0000 email_is_free 0 0.0000 housing_status 0 0.0000 phone_home_valid 0 0.0000 phone_mobile_valid 0 0.0000 bank_months_count 253635 25.3635 has_other_cards 0 0.0000 proposed_credit_limit 0 0.0000 foreign_request 0 0.0000 source 0 0.0000 session_length_in_minutes 2015 0.2015 device_os 0 0.0000 keep_alive_session 0 0.0000 device_distinct_emails_8w 0 0.0000 device_fraud_count 0 0.0000 month 0 0.0000
La cantidad de valores nulos que recogen las variables especificadas es bastante significativa. También debemos tener en cuenta que el número de registros en el dataframe es muy elevado (de 1 millón). Simplemente, vamos a tener en cuenta en el análisis y en las conclusiones que gran parte de los valores en estas variables son nulos.
# Display the per-column null summary.
base_null_columnas
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| fraud_bool | 0 | 0.0000 |
| income | 0 | 0.0000 |
| name_email_similarity | 0 | 0.0000 |
| prev_address_months_count | 712920 | 71.2920 |
| current_address_months_count | 4254 | 0.4254 |
| customer_age | 0 | 0.0000 |
| days_since_request | 0 | 0.0000 |
| intended_balcon_amount | 742523 | 74.2523 |
| payment_type | 0 | 0.0000 |
| zip_count_4w | 0 | 0.0000 |
| velocity_6h | 0 | 0.0000 |
| velocity_24h | 0 | 0.0000 |
| velocity_4w | 0 | 0.0000 |
| bank_branch_count_8w | 0 | 0.0000 |
| date_of_birth_distinct_emails_4w | 0 | 0.0000 |
| employment_status | 0 | 0.0000 |
| credit_risk_score | 0 | 0.0000 |
| email_is_free | 0 | 0.0000 |
| housing_status | 0 | 0.0000 |
| phone_home_valid | 0 | 0.0000 |
| phone_mobile_valid | 0 | 0.0000 |
| bank_months_count | 253635 | 25.3635 |
| has_other_cards | 0 | 0.0000 |
| proposed_credit_limit | 0 | 0.0000 |
| foreign_request | 0 | 0.0000 |
| source | 0 | 0.0000 |
| session_length_in_minutes | 2015 | 0.2015 |
| device_os | 0 | 0.0000 |
| keep_alive_session | 0 | 0.0000 |
| device_distinct_emails_8w | 0 | 0.0000 |
| device_fraud_count | 0 | 0.0000 |
| month | 0 | 0.0000 |
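A continuación, se define un umbral (threshold) con un valor de 0.9, con la intención de conservar solo las columnas que tengan menos del 90% de valores nulos. Vemos cómo, en vez de recoger las 32 variables, recoge 29: las 3 variables excluidas (prev_address_months_count, intended_balcon_amount y bank_months_count) tienen un 71,3 %, un 74,3 % y un 25,4 % de valores nulos respectivamente. Nótese que "porcentaje_columnas" está expresado en escala 0–100, mientras que el umbral es 0.9, por lo que la comparación tal como está escrita excluye cualquier columna con un 0,9 % o más de nulos.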
# Maximum share of nulls (as a fraction, 0-1) a column may have to be kept.
threshold=0.9
# porcentaje_columnas is stored on a 0-100 scale, so the fraction is
# converted to a percentage before comparing; otherwise the cut-off
# would be 0.9 % instead of 90 %.
list_vars_not_null = list(
    base_null_columnas[base_null_columnas['porcentaje_columnas'] < threshold * 100].index
)
Base_df_filter_null = Base_df.loc[:, list_vars_not_null]
Base_df_filter_null.shape
(1000000, 29)
# Share of nulls (as a fraction, 0-1) from which a column is reported.
threshold = 0.9
# porcentaje_columnas is stored on a 0-100 scale, so the fraction is
# converted to a percentage before comparing; otherwise the cut-off
# would be 0.9 % instead of 90 %.
list_vars_null = list(
    base_null_columnas[base_null_columnas['porcentaje_columnas'] >= threshold * 100].index
)
print("Variables con más del 90% de valores nulos:")
print(list_vars_null)
Variables con más del 90% de valores nulos: ['prev_address_months_count', 'intended_balcon_amount', 'bank_months_count']
# Display the per-row null summary.
base_null_filas
| nulos_filas | target | porcentaje_filas | |
|---|---|---|---|
| 0 | 0 | 1 | 0.0 |
| 666657 | 0 | 0 | 0.0 |
| 666659 | 0 | 0 | 0.0 |
| 666660 | 0 | 0 | 0.0 |
| 666661 | 0 | 0 | 0.0 |
| ... | ... | ... | ... |
| 333337 | 0 | 0 | 0.0 |
| 333338 | 0 | 0 | 0.0 |
| 333339 | 0 | 0 | 0.0 |
| 333340 | 0 | 0 | 0.0 |
| 999999 | 0 | 0 | 0.0 |
1000000 rows × 3 columns
# Get the low-cardinality non-numeric columns (and the remaining non-numeric ones).
list_cat_vars, other = dame_variables_categoricas(dataset = Base_df_filter_null)
# Store those columns with the pandas "category" dtype and preview them.
Base_df_filter_null[list_cat_vars] = Base_df_filter_null[list_cat_vars].astype("category")
Base_df_filter_null[list_cat_vars].head()
| payment_type | employment_status | housing_status | source | device_os | |
|---|---|---|---|---|---|
| 0 | AA | CA | BA | INTERNET | windows |
| 1 | AB | CA | BA | INTERNET | windows |
| 2 | AC | CB | BA | INTERNET | other |
| 3 | AB | CA | BA | INTERNET | linux |
| 4 | AB | CA | BA | INTERNET | macintosh |
# Display the names of the columns detected as categorical.
list_cat_vars
['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']
# Check the dtype of the categorical columns after the conversion.
Base_df_filter_null[list_cat_vars].dtypes
payment_type category employment_status category housing_status category source category device_os category dtype: object
De la variable que hemos creado con los nulos, "Base_df_filter_null" hacemos un recuento de los valores, que permanecen con la misma cantidad
# Number of rows for each value of fraud_bool in the filtered dataframe.
Base_df_filter_null['fraud_bool'].value_counts()
fraud_bool 0 988971 1 11029 Name: count, dtype: int64
# Number of distinct values present in fraud_bool.
print(Base_df_filter_null["fraud_bool"].value_counts().count())
# Relative frequency of each value once rendered as lower-case, trimmed text.
(
    Base_df_filter_null["fraud_bool"]
    .apply(lambda valor: str(valor).lower().strip())
    .value_counts(normalize=True)
)
2
fraud_bool 0 0.988971 1 0.011029 Name: proportion, dtype: float64
Lo anterior muestra la cantidad de valores únicos en la columna "fraud_bool" y la distribución de frecuencia de los valores en la columna después de normalizar los datos. A continuación hacemos lo mismo con otras variables
# Number of distinct values present in employment_status.
print(Base_df_filter_null["employment_status"].value_counts().count())
# Relative frequency of each value once rendered as lower-case, trimmed text.
(
    Base_df_filter_null["employment_status"]
    .apply(lambda valor: str(valor).lower().strip())
    .value_counts(normalize=True)
)
7
employment_status ca 0.730252 cb 0.138288 cf 0.044034 cc 0.037758 cd 0.026522 ce 0.022693 cg 0.000453 Name: proportion, dtype: float64
# Number of distinct values present in payment_type.
print(Base_df_filter_null["payment_type"].value_counts().count())
# Relative frequency of each value once rendered as lower-case, trimmed text.
(
    Base_df_filter_null["payment_type"]
    .apply(lambda valor: str(valor).lower().strip())
    .value_counts(normalize=True)
)
5
payment_type ab 0.370554 aa 0.258249 ac 0.252071 ad 0.118837 ae 0.000289 Name: proportion, dtype: float64
VALORES MISSING, OUTLIER Y CORRELACIONES¶
A continuación se realiza el estudio y preprocesamiento de las variables numéricas y categoricas. Se realizarán los siguientes pasos:
- Cambio de tipos de variables
- Separación en train y test
- Análisis de cada variable con gráficos descriptivos
- Para variables numéricas: correlaciones de Pearson, estudio de outliers y estudio de valores missing
- Para variables categóricas: relleno de valores missing, estudio de correlaciones con la V de Cramér
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
# Raise the pandas display limits so wide and long frames are printed in full.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 5000
def plot_feature(df, col_name, isContinuous, target):
    """
    Plot one variable on its own and against the target.

    Draws a figure with two panels. The left panel shows the distribution
    of ``col_name`` (histogram of its non-null values when continuous,
    count plot otherwise) and reports its number of nulls in the title.
    The right panel relates it to ``target``: a box plot when continuous,
    otherwise bars with the proportion of each target value inside every
    category of ``col_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both ``col_name`` and ``target``.
    col_name : str
        Name of the column to visualise.
    isContinuous : bool
        True to treat the column as continuous, False as categorical.
    target : str
        Name of the target column used in the right panel.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib backend.
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)

    count_null = df[col_name].isnull().sum()

    # Left panel: distribution of the variable by itself.
    if isContinuous:
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df, x=col_name, color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name+ ' Numero de nulos: '+str(count_null))
    # Rotate the labels of this panel explicitly; plt.xticks() would act on
    # the current axes, which after plt.subplots() is the right-hand panel.
    ax1.tick_params(axis='x', labelrotation=90)

    # Right panel: relation between the variable and the target.
    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by '+target)
    else:
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
        # Name the columns from the function's own arguments instead of
        # relying on a loop variable of the calling scope.
        data.columns = [col_name, target, 'proportion']
        sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        ax2.set_ylabel(target+' fraction')
        ax2.set_title(target)
        ax2.tick_params(axis='x', labelrotation=90)
    ax2.set_xlabel(col_name)

    plt.tight_layout()
def dame_variables_categoricas(Base_df=None):
    """
    Split the non-numeric columns of a dataframe by cardinality.

    A column is a candidate when its dtype is neither an integer nor a
    floating-point dtype (of any width). Candidates are then classified by
    their number of distinct non-null values.

    Parameters
    ----------
    Base_df : pandas.DataFrame, optional
        Dataframe whose columns are inspected.

    Returns
    -------
    tuple of (list, list)
        ``(lista_variables_categoricas, other)``: names of the candidate
        columns with fewer than 100 distinct values, and names of the
        candidate columns with 100 or more.
    int
        ``1`` when ``Base_df`` is None (legacy error code kept for
        backward compatibility).
    """
    if Base_df is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1

    lista_variables_categoricas = []
    other = []
    # Iterate over the argument itself; the body previously referred to an
    # undefined name `dataset`.
    for nombre in Base_df.columns:
        columna = Base_df[nombre]
        # Use the pandas dtype helpers rather than `dtype != int/float`:
        # the latter only matches the platform-default 64-bit dtypes.
        es_numerica = (
            pd.api.types.is_integer_dtype(columna.dtype)
            or pd.api.types.is_float_dtype(columna.dtype)
        )
        if es_numerica:
            continue
        # nunique() ignores NaN and is hash-based, so it also works on
        # object columns mixing types (np.unique would try to sort them).
        if columna.nunique() < 100:
            lista_variables_categoricas.append(nombre)
        else:
            other.append(nombre)

    return lista_variables_categoricas, other
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    """
    Draw the correlation matrix of a dataframe as a heatmap.

    Parameters
    ----------
    dataset : pandas.DataFrame, optional
        Data whose pairwise column correlations are computed.
    metodo : str
        Method passed to ``DataFrame.corr`` (e.g. 'pearson', 'spearman').
    size_figure : sequence
        Figure size passed to matplotlib as ``figsize``.

    Returns
    -------
    int
        0 after the plot has been shown, 1 when ``dataset`` is None.
    """
    # Guard clause: nothing to plot without data.
    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1

    sns.set(style="white")

    matriz_corr = dataset.corr(method=metodo)

    # Zero the diagonal so self-correlations do not dominate the colour scale.
    for posicion in range(len(matriz_corr)):
        matriz_corr.iat[posicion, posicion] = 0

    figura, ejes = plt.subplots(figsize=size_figure)

    sns.heatmap(
        matriz_corr,
        center=0,
        square=True,
        linewidths=.5,
        cmap='viridis',
    )
    plt.show()

    return 0
def get_deviation_of_mean_perc(Base_df, list_var_continuous, target, multiplier):
    """
    Summarise, per variable, the values lying outside mean ± multiplier·std.

    For every variable that has at least one value outside that interval,
    one row is produced with the distribution of ``target`` among those
    outlying rows.

    Parameters
    ----------
    Base_df : pandas.DataFrame
        Data containing the variables and the target.
    list_var_continuous : iterable of str
        Names of the columns to analyse.
    target : str
        Name of the target column.
    multiplier : float
        Number of standard deviations defining the interval.

    Returns
    -------
    pandas.DataFrame
        One row per variable with outliers. Columns: one per target value
        (its proportion among the outliers), ``variable``,
        ``sum_outlier_values`` (number of outliers) and
        ``porcentaje_sum_null_values`` (outliers over the column size; the
        historical column name is kept for compatibility). Empty when no
        variable has outliers.
    """
    filas = []
    for variable in list_var_continuous:
        serie = Base_df[variable]
        media = serie.mean()
        margen = multiplier * serie.std()
        # NaN compares False on both sides, so nulls are never outliers.
        fuera = (serie < media - margen) | (serie > media + margen)
        n_fuera = int(fuera.sum())
        if n_fuera == 0:
            continue

        # Build the row from the distribution itself so it works for any
        # target name and any number of target classes (including one).
        distribucion = Base_df.loc[fuera, target].value_counts(normalize=True)
        fila = pd.DataFrame([distribucion.to_dict()])
        fila['variable'] = variable
        fila['sum_outlier_values'] = n_fuera
        fila['porcentaje_sum_null_values'] = n_fuera / serie.size
        filas.append(fila)

    if not filas:
        print('No existen variables con valores atípicos')
        return pd.DataFrame()

    return pd.concat(filas, axis=0).reset_index(drop=True)
def get_percent_null_values_target(Base_df, list_var_continuous, target):
    """
    Summarise, per variable, how the target is distributed among its nulls.

    Parameters
    ----------
    Base_df : pandas.DataFrame
        Data containing the variables and the target.
    list_var_continuous : iterable of str
        Names of the columns to analyse.
    target : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        One row per variable with at least one null. Columns: one per
        target value (its proportion among the rows where the variable is
        null), ``variable``, ``sum_null_values`` (number of nulls) and
        ``porcentaje_sum_null_values`` (nulls over the number of rows).
        Empty when no variable has nulls.
    """
    filas = []
    for variable in list_var_continuous:
        nulos = Base_df[variable].isnull()
        n_nulos = int(nulos.sum())
        if n_nulos == 0:
            continue

        # Build the row from the distribution itself rather than from the
        # labels produced by reset_index(), which differ between pandas
        # versions.
        distribucion = Base_df.loc[nulos, target].value_counts(normalize=True)
        fila = pd.DataFrame([distribucion.to_dict()])
        fila['variable'] = variable
        fila['sum_null_values'] = n_nulos
        fila['porcentaje_sum_null_values'] = n_nulos / Base_df.shape[0]
        # Accumulate every row; they are all returned at the end.
        filas.append(fila)

    if not filas:
        print('No existen variables con valores nulos')
        return pd.DataFrame()

    return pd.concat(filas, axis=0).reset_index(drop=True)
def cramers_v(confusion_matrix):
    """
    Compute the bias-corrected Cramér's V for two categorical variables.

    Uses the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix : pandas.DataFrame or array-like
        Two-dimensional contingency table, e.g. the result of
        ``pd.crosstab()`` or its ``.values``.

    Returns
    -------
    float
        The corrected Cramér's V, or NaN when the corrected table has a
        single row or column and the statistic is undefined.
    """
    # Work on a plain array: DataFrame.sum() would return per-column sums
    # instead of the grand total needed below.
    tabla = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(tabla)[0]
    n = tabla.sum()
    phi2 = chi2 / n
    r, k = tabla.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    denominador = min((kcorr-1), (rcorr-1))
    if denominador <= 0:
        # Degenerate table: avoid a division by zero.
        return float('nan')
    return float(np.sqrt(phi2corr / denominador))
Lectura de datos para ver las columnas que componen el csv
# Display the column names of the dataframe.
Base_df.columns
Index(['fraud_bool', 'income', 'name_email_similarity',
'prev_address_months_count', 'current_address_months_count',
'customer_age', 'days_since_request', 'intended_balcon_amount',
'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
'velocity_4w', 'bank_branch_count_8w',
'date_of_birth_distinct_emails_4w', 'employment_status',
'credit_risk_score', 'email_is_free', 'housing_status',
'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
'session_length_in_minutes', 'device_os', 'keep_alive_session',
'device_distinct_emails_8w', 'device_fraud_count', 'month'],
dtype='object')
1. Cambiamos el formato de las variables¶
Recogemos en una lista las variables categóricas y las cambiamos a tipo "category"
# Columns to be treated as categorical, including the binary flags and the target.
variables_categoricas = [
    "payment_type",
    "employment_status",
    "housing_status",
    "email_is_free",
    "phone_home_valid",
    "phone_mobile_valid",
    "has_other_cards",
    "foreign_request",
    "source",
    "device_os",
    "keep_alive_session",
    "fraud_bool",
]
# Second name bound to the very same list object.
lista_variables_categoricas = variables_categoricas

# Convert each of those columns to the pandas "category" dtype.
for columna in variables_categoricas:
    Base_df[columna] = Base_df[columna].astype('category')
# Display the list of categorical column names.
lista_variables_categoricas
['payment_type', 'employment_status', 'housing_status', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'source', 'device_os', 'keep_alive_session', 'fraud_bool']
Pasamos las variables numéricas enteras "int" a "float" (formato decimal)
# Convert the listed count/score columns to float, one column at a time.
for _columna_entera in (
    'prev_address_months_count',
    'current_address_months_count',
    'zip_count_4w',
    'bank_branch_count_8w',
    'credit_risk_score',
):
    Base_df[_columna_entera] = Base_df[_columna_entera].astype(float)
# Display the dtype of every column after the conversions.
Base_df.dtypes
fraud_bool int64 income Float64 name_email_similarity Float64 prev_address_months_count float64 current_address_months_count float64 customer_age int64 days_since_request Float64 intended_balcon_amount Float64 payment_type float64 zip_count_4w float64 velocity_6h Float64 velocity_24h Float64 velocity_4w Float64 bank_branch_count_8w float64 date_of_birth_distinct_emails_4w int64 employment_status float64 credit_risk_score float64 email_is_free int64 housing_status float64 phone_home_valid int64 phone_mobile_valid int64 bank_months_count int64 has_other_cards int64 proposed_credit_limit Float64 foreign_request int64 source float64 session_length_in_minutes Float64 device_os float64 keep_alive_session int64 device_distinct_emails_8w int64 device_fraud_count int64 month int64 dtype: object
2. Separación train y test¶
import plotly.express as px
# Share (in %) of each fraud_bool class.
Base_df_plot_fraud_bool_status = (
    Base_df['fraud_bool']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)
# Absolute number of rows in each class.
Base_df_plot_fraud_bool_status_conteo = Base_df['fraud_bool'].value_counts().reset_index()
# Percentages and counts side by side, joined on the class label.
Base_df_plot_fraud_bool_status_pc = Base_df_plot_fraud_bool_status.merge(
    Base_df_plot_fraud_bool_status_conteo,
    on=['fraud_bool'],
    how='inner',
)

# Plot the 'percent' column against fraud_bool and render the figure.
fig = px.histogram(
    Base_df_plot_fraud_bool_status_pc,
    x="fraud_bool",
    y=['percent'],
)
fig.show()
from sklearn.model_selection import train_test_split
# Stratified 80/20 split: `stratify` keeps the same share of fraud cases in
# both parts. A fixed random_state makes the partition reproducible, so the
# figures reported below do not change from one run to the next.
X_base_df, X_base_df_test, y_base_df, y_base_df_test = train_test_split(
    Base_df.drop('fraud_bool', axis=1),
    Base_df['fraud_bool'],
    stratify=Base_df['fraud_bool'],
    test_size=0.2,
    random_state=42,
)
# Re-attach the target to the features of each partition.
Base_df_train = pd.concat([X_base_df, y_base_df],axis=1)
Base_df_test = pd.concat([X_base_df_test, y_base_df_test],axis=1)
/Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/sklearn/utils/validation.py:605: FutureWarning: is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/sklearn/utils/validation.py:614: FutureWarning: is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
# Print the relative frequency of each target value in the train and test sets.
print('== Train\n', Base_df_train['fraud_bool'].value_counts(normalize=True))
print('== Test\n', Base_df_test['fraud_bool'].value_counts(normalize=True))
== Train fraud_bool 0 0.988971 1 0.011029 Name: proportion, dtype: float64 == Test fraud_bool 0 0.98897 1 0.01103 Name: proportion, dtype: float64
Se muestra la distribución de la variable objetivo en los conjuntos de entrenamiento y prueba. Al usar la opción de normalización (`normalize=True`) se muestran proporciones y no recuentos: las proporciones son prácticamente iguales en ambos conjuntos, aunque esto no implica que el número de registros sea el mismo (80 % de los datos en entrenamiento y 20 % en prueba). La partición se hace de forma estratificada (`stratify`) precisamente para garantizar que, al estar la variable objetivo desequilibrada, ambos conjuntos conserven la misma proporción de casos de fraude.
3. Análisis de cada variable con gráficos descriptivos¶
# Display the names of the columns that make up the dataframe.
Base_df.columns
Index(['fraud_bool', 'income', 'name_email_similarity',
'prev_address_months_count', 'current_address_months_count',
'customer_age', 'days_since_request', 'intended_balcon_amount',
'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
'velocity_4w', 'bank_branch_count_8w',
'date_of_birth_distinct_emails_4w', 'employment_status',
'credit_risk_score', 'email_is_free', 'housing_status',
'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
'session_length_in_minutes', 'device_os', 'keep_alive_session',
'device_distinct_emails_8w', 'device_fraud_count', 'month'],
dtype='object')
- Analizamos el número de valores nulos por filas y por columnas
# Display the per-column null summary computed earlier.
base_null_columnas
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| fraud_bool | 0 | 0.0000 |
| income | 0 | 0.0000 |
| name_email_similarity | 0 | 0.0000 |
| prev_address_months_count | 712920 | 71.2920 |
| current_address_months_count | 4254 | 0.4254 |
| customer_age | 0 | 0.0000 |
| days_since_request | 0 | 0.0000 |
| intended_balcon_amount | 742523 | 74.2523 |
| payment_type | 0 | 0.0000 |
| zip_count_4w | 0 | 0.0000 |
| velocity_6h | 0 | 0.0000 |
| velocity_24h | 0 | 0.0000 |
| velocity_4w | 0 | 0.0000 |
| bank_branch_count_8w | 0 | 0.0000 |
| date_of_birth_distinct_emails_4w | 0 | 0.0000 |
| employment_status | 0 | 0.0000 |
| credit_risk_score | 0 | 0.0000 |
| email_is_free | 0 | 0.0000 |
| housing_status | 0 | 0.0000 |
| phone_home_valid | 0 | 0.0000 |
| phone_mobile_valid | 0 | 0.0000 |
| bank_months_count | 253635 | 25.3635 |
| has_other_cards | 0 | 0.0000 |
| proposed_credit_limit | 0 | 0.0000 |
| foreign_request | 0 | 0.0000 |
| source | 0 | 0.0000 |
| session_length_in_minutes | 2015 | 0.2015 |
| device_os | 0 | 0.0000 |
| keep_alive_session | 0 | 0.0000 |
| device_distinct_emails_8w | 0 | 0.0000 |
| device_fraud_count | 0 | 0.0000 |
| month | 0 | 0.0000 |
Tenemos ya una variable a la que se le asignan las variables categóricas. A continuación creamos otra que recoja las variables numéricas. Asignamos la variable 'fraud_bool' como variable objetivo para visualizar su distribución de cara al fraude.
# Names of the columns treated as numeric.
lista_variables_numericas = ["income", "name_email_similarity", "prev_address_months_count", "current_address_months_count", "customer_age", "days_since_request",
"intended_balcon_amount", "zip_count_4w", "velocity_6h", "velocity_24h", "velocity_4w", "bank_branch_count_8w", "date_of_birth_distinct_emails_4w",
"credit_risk_score", "bank_months_count", "proposed_credit_limit", "session_length_in_minutes", "device_distinct_emails_8w", "device_fraud_count",
"month"]
# Name of the target column.
target = 'fraud_bool'
# Store the target of the training set as strings.
Base_df_train[target] = Base_df_train[target].astype(str)
%%time
# Plot every column except the target; float columns are drawn as
# continuous variables and all the others as categorical ones.
for i in Base_df_train.columns:
    if i == 'fraud_bool':
        continue
    plot_feature(
        Base_df_train,
        col_name=i,
        isContinuous=(Base_df_train[i].dtype == float),
        target='fraud_bool',
    )
/Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
/var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. 
/Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
/Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
/var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:9: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). 
Consider using `matplotlib.pyplot.close()`. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
/Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
/var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
/Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /var/folders/1l/q9rgmm2d30j9k7npzfz8srnr0000gn/T/ipykernel_17354/3531263144.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead /Users/ruthjorganestorres/anaconda3/envs/Practica_1/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
CPU times: user 3min 57s, sys: 10.3 s, total: 4min 8s Wall time: 4min 38s
En los gráficos anteriores observamos la relación entre las variables categóricas y numéricas de cara a nuestra variable target que es fraud_bool y la cantidad de nulos. Los nulos de nuestro dataset son representados por números negativos y hemos decidido dejarlos en nuestro dataset teniéndolos en cuenta a la hora de interpretar resultados y gráficos.
Las variables categóricas tienen como segundo gráfico un gráfico de barras para ilustrar su relación con el target y las variables numéricas tienen un boxplot que ilustra la distribución de la variable ante aplicaciones fraudulentas representadas por el color naranja y las no fraudulentas (legítimas) representadas por el color azul. A continuación estaremos interpretando la relación entre las variables frente al target:
Income: en esta variable podemos observar que la distribución y la media de ingresos en las aplicaciones fraudulentas es más alta que en las aplicaciones legítimas. Esto puede indicar que en las aplicaciones fraudulentas se tiende a poner ingresos anuales más altos que en las legítimas para intentar conseguir una aprobación de crédito más alta.
name_email_similarity: esta variable explora la similitud entre el nombre del aplicante y su dirección de correo electrónico. En el gráfico de boxplot podemos ver que lo típico de las aplicaciones fraudulentas es que la dirección de correo no se parezca al nombre del aplicante. Entre otras razones puede deberse a que las aplicaciones fraudulentas sean generadas al azar y que ninguno de los detalles en ellas sea legítimo, en especial el nombre y correo electrónico del aplicante.
prev_address_months_count: esta variable indica la cantidad de meses que el aplicante residió en su dirección anterior. Es una variable sin una distribución distinguible porque muchas instancias tienen valores nulos. En relación a las aplicaciones legítimas podemos observar que hay mayor dispersión cerca de los primeros 10 meses, en las aplicaciones fraudulentas no hay suficientes valores para ilustrar una dispersión. Esto quiere decir que en su mayoría las instancias con aplicaciones fraudulentas no incluyen esta información.
current_address_months_count: en esta variable se indica la cantidad de meses que una persona ha vivido en su actual residencia. Podemos observar una dispersión similar entre las instancias de aplicaciones legítimas y fraudulentas pero la media en las aplicaciones fraudulentas es mayor que en las legítimas. Esto indica que en las aplicaciones fraudulentas podemos encontrar que ponen más tiempo viviendo en su residencia actual para presentarse como un candidato deseable para recibir un producto de crédito del banco.
customer_age: en esta variable podemos ver la edad del aplicante redondeada a la década más cercana. En el gráfico de barras podemos observar que las aplicaciones fraudulentas aumentan en frecuencia según aumentan las edades de los aplicantes. De esto podemos deducir que en las aplicaciones fraudulentas ponen edades que pertenecen a la población adulta mayor y de la tercera edad para no despertar sospechas en sus aplicaciones.
days_since_request: en esta variable se almacenan la cantidad de días que han pasado desde que se hizo la solicitud de crédito. Podemos ver en el gráfico de box plot que las aplicaciones fraudulentas tienden a ser contestadas más rápido que las legítimas, pensamos que puede ser porque se identifican como fraudulentas y se descartan más rápido.
intended_balcon_amount: en esta variable
payment_type: en esta variable hay 5 tipos de planes de pago de crédito anonimizados. Hay muy poca diferencia entre la cantidad de aplicaciones fraudulentas pero dónde más podemos encontrar es en el tipo de pago AC.
zip_count_4w: en esta variable podemos ver la cantidad de aplicantes registrados en el mismo código postal en las últimas cuatro semanas. La dispersión y media entre las aplicaciones legítimas y fraudulentas registradas en el mismo código postal es similar, pero podemos ver que la media de las aplicaciones fraudulentas es mayor, lo que nos lleva a deducir que hay mayor repetición de códigos postales en las aplicaciones fraudulentas.
velocity_6h, velocity_24h, velocity_4w: en estas variables podemos ver la cantidad de aplicaciones por hora hechas en las últimas 6 horas, 24 horas y 4 semanas. La dispersión entre las aplicaciones legítimas y fraudulentas es similar en estas tres variables pero la media de aplicaciones legítimas es mayor. Estos resultados tienen sentido dentro de nuestro dataset ya que hay una mayor cantidad de instancias que son aplicaciones legítimas.
bank_branch_count_8w: en esta variable podemos ver la cantidad total de aplicaciones en la sucursal bancaria en las últimas 8 semanas. La dispersión y media entre la cantidad de en las aplicaciones fraudulentas. Para las aplicaciones fraudulentas no hay suficientes valores para ilustrar una dispersión. Esto quiere decir que en su mayoría las instancias con aplicaciones fraudulentas no incluyen esta información.
date_of_birth_distinct_emails_4w: En esta variable vemos la cantidad de emails para aplicantes con la misma fecha de nacimiento en las últimas cuatro semanas. En el gráfico de barras podemos ver que la cantidad de aplicaciones fraudulentas que tienen la misma fecha de nacimiento son muy pocas, con la mayor cantidad concentrada en 0. Esto indica que hay pocas repeticiones en las fechas de nacimiento en las aplicaciones fraudulentas.
employment_status: esta variable está compuesta por 7 valores anonimizados que indican el estatus de empleo del aplicante. Según podemos ver en el gráfico de barras el valor más repetido en las aplicaciones fraudulentas es CC, lo que nos lleva a pensar que este valor puede ser un estatus que se considera favorable para la otorgación de crédito por parte de los clientes que someten aplicaciones fraudulentas.
credit_risk_score: esta variable es un rango que indica el nivel de riesgo de la aplicación. Podemos observar una menor dispersión en las aplicaciones legítimas que en las fraudulentas, lo que nos lleva a pensar que el nivel de riesgo asignado es un valor confiable. Esto se confirma observando la dispersión y media de las aplicaciones fraudulentas, podemos observar que a estas aplicaciones se les asigna un nivel más alto de riesgo.
email_is_free: esta variable indica a través de un binario si el dominio del cual origina el correo electrónico es gratis o de pago. Podemos ver que no hay una diferencia significativa entre las aplicaciones fraudulentas dentro del binario de esta variable, esto indica que no hay una predisposición en esta variable entre las instancias.
housing_status: dentro de esta variable podemos ver que la distribución de instancias con aplicaciones fraudulentas está bien distribuida entre los 7 posibles valores. El valor con mayor cantidad de aplicaciones fraudulentas es BA pero no es suficiente para sugerir que esta variable acapara la mayor parte de aplicaciones fraudulentas.
phone_home_valid, phone_mobile_valid: en estas variables podemos ver que no hay diferencia significativa en la presencia de fraude en números móvil o teléfono que sean válidos.
bank_months_count: en esta variable podemos observar que las cuentas de 17 meses son las que más presencia de aplicaciones fraudulentas tienen.
has_other_cards: en esta variable podemos ver que en el binario que indica si la persona tiene otras tarjetas con el banco que recibe la aplicación puede ser un factor a tomar en cuenta al determinar si la aplicación es fraudulenta. No hay una diferencia significativa en el binario pero podemos ver que las instancias de fraude que son graficadas pertenecen a la categoría de personas que no tienen otras tarjetas con la entidad bancaria que recibe la aplicación.
proposed_credit_limit: podemos ver en los gráficos que las aplicaciones legítimas tienen menos dispersión y menor media que las aplicaciones fraudulentas. Esto nos deja saber que una aplicación fraudulenta puede pedir cualquier límite crediticio pero que por su media se deben monitorear las aplicaciones que pidan límites que superen la norma porque este factor junto con otros puede ayudar a identificar fraude.
foreign_request: en esta variable podemos ver que no existe una diferencia significativa en las instancias de fraude entre las aplicaciones domésticas o extranjeras pero podemos ver que hay más fraude acumulado en las aplicaciones extranjeras. La procedencia de la aplicación no es suficiente para determinar si es fraudulenta o no pero es un factor a considerar.
source: en este gráfico podemos ver que a simple vista las instancias de aplicaciones fraudulentas son las mismas entre las aplicaciones que proceden de la web y de la aplicación móvil del banco. Esta variable no es de gran utilidad para identificar la posibilidad de fraude.
session_length_in_minutes: viendo que la dispersión y la media del largo de la sesión activa en las aplicaciones legítimas y fraudulentas es igual concluimos que esta variable no es útil en la identificación de una posible aplicación fraudulenta.
device_os: en esta variable podemos ver que el sistema operativo del dispositivo del cual origina la aplicación no tiene una significancia notable para identificar fraude. Todos los sistemas tienen una acumulación de fraude similar pero destaca el sistema operativo Windows con la mayor acumulación. Habría que observar más de cerca las aplicaciones que procedan de dispositivos con sistema operativo Windows, entre otras características.
keep_alive_session: en esta variable vemos que la acumulación de aplicaciones fraudulentas es similar entre los aplicantes que mantienen su sesión activa y los que no, pero hay más instancias de fraude entre los que deciden no mantener la sesión activa.
device_distinct_emails: en este gráfico podemos ver que los dispositivos desde los que originan aplicaciones con más de una dirección de correo electrónico acumulan la mayor cantidad de aplicaciones fraudulentas. Estaría bien mirar estas aplicaciones más de cerca ya que pueden ayudar a detectar posibles casos de fraude.
device_fraud_count: en esta variable podemos ver la cantidad de aplicaciones fraudulentas sometidas con el mismo dispositivo, pero solo tenemos valores de 0 indicando que no se recibe más de una aplicación fraudulenta desde un solo dispositivo. Esto pudiera ser porque bloquean al dispositivo de someter aplicaciones una vez se haya sometido una con fraude.
month: en esta variable podemos ver que la distribución de las aplicaciones fraudulentas es muy similar en los meses representados en el dataset y ninguno destaca especialmente. El mes en el que se hace la aplicación no es una variable que nos ayude a determinar instancias de fraude.
Variables numéricas: valores missing, outliers y correlaciones¶
# Show the names of the numeric columns analysed in this section.
lista_variables_numericas
['income', 'name_email_similarity', 'prev_address_months_count', 'current_address_months_count', 'customer_age', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'session_length_in_minutes', 'device_distinct_emails_8w', 'device_fraud_count', 'month']
# Display Base_df_train (presumably the training split; it is built outside this section).
Base_df_train
| income | name_email_similarity | prev_address_months_count | current_address_months_count | customer_age | days_since_request | intended_balcon_amount | payment_type | zip_count_4w | velocity_6h | velocity_24h | velocity_4w | bank_branch_count_8w | date_of_birth_distinct_emails_4w | employment_status | credit_risk_score | email_is_free | housing_status | phone_home_valid | phone_mobile_valid | bank_months_count | has_other_cards | proposed_credit_limit | foreign_request | source | session_length_in_minutes | device_os | keep_alive_session | device_distinct_emails_8w | device_fraud_count | month | fraud_bool | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 589952 | 0.6 | 0.429674 | -1.0 | 32.0 | 30 | 0.010047 | 15.299031 | AA | 1635.0 | 8736.670001 | 5399.266061 | 4348.347034 | 8.0 | 13 | CA | 134.0 | 0 | BE | 0 | 1 | 20 | 1 | 200.0 | 0 | INTERNET | 2.276646 | other | 1 | 1 | 0 | 6 | 0 |
| 326603 | 0.9 | 0.891825 | -1.0 | 107.0 | 40 | 0.012554 | -0.644093 | AC | 1334.0 | 5688.940109 | 8144.084164 | 5518.511924 | 0.0 | 7 | CA | 63.0 | 0 | BA | 1 | 0 | -1 | 0 | 200.0 | 0 | INTERNET | 9.679169 | windows | 0 | 1 | 0 | 2 | 0 |
| 149122 | 0.9 | 0.598939 | -1.0 | 174.0 | 20 | 0.019957 | -0.919622 | AD | 2459.0 | 5604.175968 | 4860.885761 | 4796.591264 | 1.0 | 11 | CA | 244.0 | 0 | BA | 0 | 1 | 1 | 0 | 1000.0 | 0 | INTERNET | 2.463254 | windows | 1 | 1 | 0 | 3 | 0 |
| 142133 | 0.5 | 0.531510 | -1.0 | 140.0 | 60 | 0.025658 | -0.897969 | AB | 1406.0 | 7200.434217 | 5887.623549 | 4781.552066 | 2077.0 | 5 | CA | 134.0 | 1 | BC | 1 | 0 | 2 | 0 | 500.0 | 0 | INTERNET | 24.439424 | windows | 1 | 1 | 0 | 3 | 0 |
| 133021 | 0.2 | 0.083350 | 59.0 | 17.0 | 30 | 13.750229 | -0.675878 | AB | 591.0 | 9052.031397 | 6184.825380 | 4853.898984 | 1.0 | 10 | CA | 106.0 | 0 | BC | 0 | 1 | 28 | 0 | 200.0 | 0 | INTERNET | 5.931679 | windows | 1 | 1 | 0 | 3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 783602 | 0.1 | 0.172221 | -1.0 | 74.0 | 30 | 0.003580 | -0.520913 | AB | 550.0 | 793.837064 | 4564.594295 | 4156.615833 | 14.0 | 6 | CA | 84.0 | 0 | BB | 1 | 1 | 1 | 0 | 200.0 | 0 | INTERNET | 8.853195 | other | 1 | 1 | 0 | 5 | 0 |
| 587705 | 0.9 | 0.085711 | -1.0 | 39.0 | 30 | 0.009932 | -0.688571 | AB | 885.0 | 6030.723514 | 2689.887620 | 3098.937256 | 15.0 | 6 | CA | 118.0 | 1 | BC | 0 | 1 | 1 | 0 | 200.0 | 0 | INTERNET | 11.322296 | linux | 1 | 1 | 0 | 6 | 0 |
| 816053 | 0.6 | 0.084049 | -1.0 | 39.0 | 30 | 0.012224 | 51.120107 | AA | 1092.0 | 11609.777812 | 6419.402777 | 4154.172059 | 921.0 | 14 | CA | 56.0 | 1 | BE | 0 | 1 | 2 | 1 | 1500.0 | 0 | INTERNET | 1.647088 | linux | 1 | 1 | 0 | 5 | 0 |
| 255857 | 0.5 | 0.601015 | 83.0 | 23.0 | 30 | 0.020667 | 48.512256 | AA | 1474.0 | 7355.383548 | 5242.112445 | 5594.437459 | 1.0 | 4 | CB | 201.0 | 0 | BB | 0 | 1 | 20 | 0 | 1000.0 | 0 | INTERNET | 9.120114 | linux | 0 | 1 | 0 | 2 | 0 |
| 596849 | 0.6 | 0.908472 | -1.0 | 146.0 | 40 | 0.012759 | -0.805300 | AB | 627.0 | 2178.086557 | 3270.257629 | 4138.185940 | 1367.0 | 4 | CB | 159.0 | 0 | BB | 0 | 1 | 15 | 0 | 200.0 | 0 | INTERNET | 13.147237 | other | 1 | 1 | 0 | 6 | 0 |
800000 rows × 32 columns
En la tabla anterior podemos localizar valores outlier, que no vamos a tratar por el momento.
Con la siguiente función se calcula la desviación de la media en las variables numéricas, teniendo en cuenta la función objetivo. A través del multiplicador se detectan los outliers en la columna sum_outlier_values. En este caso, vemos como la variable "bank_branch_count_8w" es la que más valores outlier presenta, con el valor más alto: 32.807
# Summarise, per numeric variable, the values flagged as outliers and how they
# split across the two classes of the target (columns 0 and 1 of the output).
# NOTE(review): get_deviation_of_mean_perc is defined elsewhere; multiplier=3
# presumably sets the cut-off at 3 standard deviations from the mean — confirm.
get_deviation_of_mean_perc(Base_df_train, lista_variables_numericas, target='fraud_bool', multiplier=3)
| 0 | 1 | variable | sum_outlier_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.993783 | 0.006217 | prev_address_months_count | 20267 | 0.025334 |
| 1 | 0.984579 | 0.015421 | current_address_months_count | 17184 | 0.021480 |
| 2 | 0.957102 | 0.042898 | customer_age | 6294 | 0.007867 |
| 3 | 0.98907 | 0.01093 | days_since_request | 14273 | 0.017841 |
| 4 | 0.99011 | 0.00989 | intended_balcon_amount | 15268 | 0.019085 |
| 5 | 0.990713 | 0.009287 | zip_count_4w | 13029 | 0.016286 |
| 6 | 0.993665 | 0.006335 | velocity_6h | 3473 | 0.004341 |
| 7 | 0.995546 | 0.004454 | velocity_24h | 449 | 0.000561 |
| 8 | 0.989667 | 0.010333 | bank_branch_count_8w | 32807 | 0.041009 |
| 9 | 0.993024 | 0.006976 | date_of_birth_distinct_emails_4w | 5017 | 0.006271 |
| 10 | 0.965648 | 0.034352 | credit_risk_score | 2911 | 0.003639 |
| 11 | 0.871784 | 0.128216 | proposed_credit_limit | 4898 | 0.006123 |
| 12 | 0.980701 | 0.019299 | session_length_in_minutes | 18913 | 0.023641 |
| 13 | 0.962992 | 0.037008 | device_distinct_emails_8w | 25508 | 0.031885 |
Análisis de correlación de las variables¶
# Pearson correlation of the numeric columns of Base_df_train.
# NOTE(review): get_corr_matrix is defined elsewhere; size_figure presumably
# controls the size of the figure it draws — confirm against the helper.
corr_matrix =get_corr_matrix(dataset = Base_df_train[lista_variables_numericas],
metodo='pearson', size_figure=[10,8])
Se aprecia una fuerte correlación de "velocity_4w" y "velocity_24h" con la variable "month". La relación que estas variables pueden tener entre sí podría estar relacionada con la estacionalidad o el comportamiento a lo largo del tiempo. Podría analizarse si la velocidad de las solicitudes (velocity) en las últimas 24 horas y en las últimas 4 semanas varía según el mes en el que se realizaron las aplicaciones y veríamos como influye una variable sobre la otra.
También vemos relación entre "customer_age" y "date_of_birth_distinct_emails_4w". Si hay un alto valor en "date_of_birth_distinct_emails_4w" para solicitantes con la misma fecha de nacimiento, esto podría indicar que las personas que comparten cumpleaños reciben más correos electrónicos en las últimas 4 semanas.
A continuación, vamos a analizar aquellas variables que tienen una correlación mínima del 50%, para detectar aquellas con mayor relación
# Absolute Pearson correlation between every pair of numeric columns.
# NOTE(review): this cell reads Base_df, whereas the get_corr_matrix call above
# reads Base_df_train — confirm which frame is intended.
corr = Base_df[lista_variables_numericas].corr(method='pearson')
new_corr = corr.abs()

# Zero out the diagonal and the upper triangle so each pair is counted once.
new_corr.loc[:, :] = np.tril(new_corr, k=-1)

# Reshape to one row per (variable, variable) pair, strongest correlation first.
new_corr = (
    new_corr.stack()
    .to_frame('correlation')
    .reset_index()
    .sort_values(by='correlation', ascending=False)
)

# Keep only the pairs whose absolute correlation is above 0.5.
new_corr.loc[new_corr['correlation'] > 0.5]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 372 | month | velocity_4w | 0.848100 |
| 313 | proposed_credit_limit | credit_risk_score | 0.606141 |
| 371 | month | velocity_24h | 0.549919 |
| 209 | velocity_4w | velocity_24h | 0.539115 |
Valores nulos¶
En este dataset, cualquier función para detectar nulos no será eficaz ya que en este caso adquieren valores de -1 o negativos. Es decir, no podremos detectar los valores nulos de la manera habitual, ya que detecta NAs y en estos datos no hay. He aquí el ejemplo:
# Run the null-value report on the numeric variables against the target.
# Per the surrounding notes, missing values in this dataset are encoded as
# negative numbers rather than NaN, so this call is expected to find none.
get_percent_null_values_target(Base_df, lista_variables_numericas, target='fraud_bool')
No existen variables con valores nulos
Variables categóricas¶
Coeficiente de Cramer¶
A continuación tenemos una lista con todas las variables categóricas del dataset. Realizaremos el coeficiente de Cramer con cada una de ellas para estudiar si hay relación entre las mismas y la variable objetivo ('fraud_bool')
# Display the list of categorical variable names (bare expression so the notebook shows it).
variables_categoricas
['payment_type', 'employment_status', 'housing_status', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'source', 'device_os', 'keep_alive_session', 'fraud_bool']
# Contingency table of the target vs. payment_type. Both series are taken from
# the training split so the result does not rely on implicit index alignment
# between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["payment_type"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
payment_type AA AB AC AD AE fraud_bool 0 205602 293088 198341 93927 219 1 1091 3340 3350 1041 1
0.038745961511433426
Según el coeficiente de Cramer, la asociación entre estas dos variables es débil. Esto sugiere que el tipo de pago no es un buen predictor que indique si se cometerá fraude en una transacción.
# Contingency table of the target vs. employment_status. Both series are taken
# from the training split so the result does not rely on implicit index
# alignment between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["employment_status"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
employment_status CA CB CC CD CE CF CG fraud_bool 0 577124 109852 29596 20969 18110 35164 362 1 7100 775 743 81 47 72 5
0.038875527423075075
La relación entre estas dos variables también está considerada como débil según el coeficiente de Cramer. Esto indica que puede haber mejores variables para predecir fraude o que la predicción de fraude puede ser más certera si se toman en cuenta otras variables a la vez que el estatus de empleo.
# Contingency table of the target vs. housing_status. Both series are taken
# from the training split so the result does not rely on implicit index
# alignment between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["housing_status"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
housing_status BA BB BC BD BE BF BG fraud_bool 0 130685 207594 295947 20764 134644 1338 205 1 5046 1282 1856 173 460 5 1
0.11363557295742692
El coeficiente de Cramer entre estas dos variables indica que hay una asociación moderada entre ambas. Esto quiere decir que la variable "housing_status" puede tener poder predictivo para determinar si una transacción es fraudulenta o no.
# Contingency table of the target vs. email_is_free. Both series are taken
# from the training split so the result does not rely on implicit index
# alignment between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["email_is_free"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
email_is_free 0 1 fraud_bool 0 373037 418140 1 2992 5831
0.02766654939510862
El coeficiente de Cramer indica que hay poca relación entre estas variables; la variable "email_is_free" no es un buen factor predictivo de fraude.
# Contingency table of the target vs. phone_home_valid. Both series are taken
# from the training split so the result does not rely on implicit index
# alignment between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["phone_home_valid"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
phone_home_valid 0 1 fraud_bool 0 459847 331330 1 6564 2259
0.034441578918114994
La relación de estas variables es débil según el coeficiente de Cramer. Esto indica que la validez de un número de teléfono no es suficiente para predecir si se cometerá fraude.
# Contingency table of the target vs. phone_mobile_valid. Both series are taken
# from the training split so the result does not rely on implicit index
# alignment between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["phone_mobile_valid"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
phone_mobile_valid 0 1 fraud_bool 0 86972 704205 1 1339 7484
0.013878204789521916
La relación de estas variables es débil según el coeficiente de Cramer. Esto indica que la validez de un número de móvil no es suficiente para predecir si se cometerá fraude.
# Contingency table of the target vs. has_other_cards. Both series are taken
# from the training split so the result does not rely on implicit index
# alignment between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["has_other_cards"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
has_other_cards 0 1 fraud_bool 0 613852 177325 1 8061 762
0.0345532374987317
La relación de estas variables es débil según el coeficiente de Cramer. Esto indica que el hecho de que una persona tenga otras tarjetas de crédito no es suficiente para predecir si se cometerá fraude.
# Contingency table of the target vs. foreign_request. Both series are taken
# from the training split so the result does not rely on implicit index
# alignment between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["foreign_request"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
foreign_request 0 1 fraud_bool 0 771461 19716 1 8386 437
0.01632489159553951
La relación de estas variables es débil según el coeficiente de Cramer. Esto indica que no es suficiente si las transacciones ocurren en un país diferente al país del banco para predecir si se cometerá fraude.
# Contingency table of the target vs. source. Both series are taken from the
# training split so the result does not rely on implicit index alignment
# between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["source"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
source INTERNET TELEAPP fraud_bool 0 785652 5525 1 8729 94
0.004378247355559008
La relación de estas variables es débil según el coeficiente de Cramer. Esto indica que el origen de la aplicación (App o página web) no es suficiente para predecir si se cometerá fraude.
# Contingency table of the target vs. device_os. Both series are taken from
# the training split so the result does not rely on implicit index alignment
# between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["device_os"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
device_os linux macintosh other windows x11 fraud_bool 0 264513 42470 272832 205628 5734 1 1385 605 1581 5189 63
0.08002228306972355
El coeficiente de Cramer entre estas dos variables indica que hay una asociación moderada entre ambas. Esto quiere decir que el sistema operativo del dispositivo electrónico desde el que se origina la solicitud puede tener poder predictivo para determinar si una transacción es fraudulenta o no, siendo Windows el sistema operativo desde el que se comete más fraude en este dataset.
# Contingency table of the target vs. keep_alive_session. Both series are taken
# from the training split so the result does not rely on implicit index
# alignment between Base_df and Base_df_train.
confusion_matrix = pd.crosstab(Base_df_train["fraud_bool"], Base_df_train["keep_alive_session"])
print(confusion_matrix)
# Cramer's V on the raw counts (last expression so the notebook displays it).
cramers_v(confusion_matrix.values)
keep_alive_session 0 1 fraud_bool 0 332722 458455 1 5807 3016
0.05020603874422213
La relación de estas variables es débil según el coeficiente de Cramer. Esto indica que si el cliente desea mantener la sesión activa sin cerrar o no, no es suficiente para predecir si se cometerá fraude.
En general, podemos ver que la correlación entre las variables es prácticamente nula. Aquella variable con mayor relación con la existencia o no de fraude (variable objetivo) es "housing_status", con un resultado de 0,114. Por otro lado, la variable con menor relación es "source", con un valor de 0,004.
Tratamiento de valores nulos¶
En las variables categóricas, los valores nulos se suelen sustituir por una nueva clase ("sin valor") o por la moda. En este caso, no encontramos valores nulos en las variables categóricas (solo en las numéricas). Por tanto, no realizamos ningún cambio sobre ellas.
Guardado de la tabla¶
# Persist the train/test splits. index=False keeps the row index out of the
# CSV; otherwise it comes back as a spurious 'Unnamed: 0' column when the files
# are re-read and could leak into the model as a feature.
Base_df_train.to_csv("../data/train_base_data_preprocessing.csv", index=False)
Base_df_test.to_csv("../data/test_base_data_preprocessing.csv", index=False)
# Sanity check on the sizes of the two splits.
print(Base_df_train.shape)
print(Base_df_test.shape)
(800000, 32) (200000, 32)
Codificación de las variables categoricas, escalado y modelo¶
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
# Reload the preprocessed splits from disk, using 'month' as the row index.
Base_df_train = (
    pd.read_csv("../data/train_base_data_preprocessing.csv")
    .set_index(['month'])
)
Base_df_test = (
    pd.read_csv("../data/test_base_data_preprocessing.csv")
    .set_index(['month'])
)
# Display the columns available after reloading.
Base_df_train.columns
Index(['Unnamed: 0', 'income', 'name_email_similarity',
'prev_address_months_count', 'current_address_months_count',
'customer_age', 'days_since_request', 'intended_balcon_amount',
'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
'velocity_4w', 'bank_branch_count_8w',
'date_of_birth_distinct_emails_4w', 'employment_status',
'credit_risk_score', 'email_is_free', 'housing_status',
'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
'session_length_in_minutes', 'device_os', 'keep_alive_session',
'device_distinct_emails_8w', 'device_fraud_count', 'fraud_bool'],
dtype='object')
# Show the dtype of every column.
# NOTE(review): this inspects the in-memory Base_df, not the Base_df_train /
# Base_df_test frames reloaded from CSV above -- confirm that is intended.
Base_df.dtypes
fraud_bool category income float64 name_email_similarity float64 prev_address_months_count float64 current_address_months_count float64 customer_age int64 days_since_request float64 intended_balcon_amount float64 payment_type category zip_count_4w float64 velocity_6h float64 velocity_24h float64 velocity_4w float64 bank_branch_count_8w float64 date_of_birth_distinct_emails_4w int64 employment_status category credit_risk_score float64 email_is_free category housing_status category phone_home_valid category phone_mobile_valid category bank_months_count int64 has_other_cards category proposed_credit_limit float64 foreign_request category source category session_length_in_minutes float64 device_os category keep_alive_session category device_distinct_emails_8w int64 device_fraud_count int64 month int64 dtype: object
Conclusiones:
En este ejercicio pudimos explorar los datos del EDA dataset y prepararlos para usarlos como entrenamiento de un modelo predictivo de fraude en el futuro. Escogimos nuestra variable target y exploramos la correlación de esta con las otras variables dentro del dataset. Escogimos como target la variable binaria "fraud_bool", que identifica las aplicaciones de crédito bancarias fraudulentas con un 1 y las aplicaciones legítimas con un 0.
Hicimos un análisis en el que vimos la acumulación de fraude entre el resto de variables en el dataset para identificar variables significativas a la hora de identificar fraude. Entre las variables que identificamos como significativas en este ejercicio están "proposed_credit_limit", "bank_months_count", "credit_risk_score" e "income".
Además, vimos el nivel de correlación de las variables ante nuestro target utilizando el coeficiente de Cramer. Las variables con mayor correlación a la variable target fueron "housing_status" y "device_os".
Durante nuestra exploración vimos los valores nulos dentro de cada variable de nuestro dataset identificados por valores numéricos negativos. Hemos decidido mantenerlos en nuestro dataset y formular conclusiones tomándolos en cuenta.
Esperamos que con este ejercicio exploratorio nuestros datos estén en condiciones óptimas para entrenar un modelo predictivo en el futuro.